In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
%matplotlib inline
In [2]:
# Load the house-sales dataset from the working directory.
# NOTE(review): presumably the King County house-sales data — confirm provenance.
sales = pd.read_csv('home_data.csv')
In [3]:
# Take a random 80% sample as training data. Seed the sampler so a
# restart-and-run-all reproduces the same subset.
# NOTE(review): this split is immediately superseded by the train_test_split
# call in the next cell — candidate for removal.
train_data = sales.sample(frac=0.8, random_state=42)
In [4]:
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# 80% train / 20% test. Assign the returned (train, test) pair in order,
# rather than the original's swapped names with test_size=0.8, which made
# the 20% slice the "test" set only by accident of unpacking order.
# random_state pins the shuffle so the split is reproducible.
train_data, test_data = train_test_split(sales, test_size=0.2, random_state=42)
In [5]:
# Number of rows held out for testing.
test_data.shape[0]
Out[5]:
In [6]:
# Number of rows available for training.
train_data.shape[0]
Out[6]:
In [7]:
# Summary statistics (count / mean / std / quartiles) for the numeric columns.
sales.describe()
Out[7]:
In [8]:
# Distinct bedroom counts present in the data.
sales.bedrooms.unique()
Out[8]:
In [9]:
# Non-null count per column — quick missing-data check.
sales.count()
Out[9]:
In [10]:
# Boolean flag: True for rows with a recorded (non-zero) renovation year.
sales['renovated'] = sales['yr_renovated'].gt(0)
In [11]:
# First five rows, including the new 'renovated' column.
sales.head()
Out[11]:
In [12]:
# Price vs. living area. Title and label the axes so the figure stands
# alone when the notebook is skimmed.
ax = sales.plot.scatter(x='sqft_living', y='price')
ax.set_title('Sale price vs. living area')
ax.set_xlabel('Living area (sqft)')
ax.set_ylabel('Price ($)')
ax
Out[12]:
In [13]:
# Row index (default RangeIndex assigned by read_csv).
sales.index
Out[13]:
In [14]:
# Column labels of the dataset.
sales.columns
Out[14]:
In [15]:
# Underlying numpy array. Show only the first few rows — dumping the whole
# array floods the notebook output.
sales.values[:5]
Out[15]:
In [16]:
# NOTE(review): duplicate of the sales.index cell above — candidate for removal.
sales.index
Out[16]:
In [19]:
# Frequency of each bedroom count across all sales.
bed_count = sales['bedrooms'].value_counts()
In [20]:
# value_counts() returns a pandas Series (bedroom count -> frequency).
type(bed_count)
Out[20]:
In [21]:
# Bedroom-count frequencies, most common first.
bed_count
Out[21]:
In [22]:
# Sort by bedroom count so the x-axis is ordered, then plot the frequencies.
# Label the axes so the figure stands alone.
ax = bed_count.sort_index().plot()
ax.set_xlabel('Bedrooms')
ax.set_ylabel('Number of houses')
ax
Out[22]:
In [24]:
# .ix was deprecated in pandas 0.20 and later removed. .loc preserves its
# label-based behavior here: with the default RangeIndex, label 10 is also
# position 10.
sales.loc[10]
Out[24]:
In [54]:
# Ordinary least-squares model to be fit on a single feature (sqft_living).
regr_one_feature = linear_model.LinearRegression()
In [55]:
# Select with a list of columns so the result is already 2-D
# (n_samples, 1) — sklearn estimators expect a 2-D feature matrix. The
# original 1-D .values array had to be reshaped in a later cell.
training_data_features = train_data[["sqft_living"]].values
In [56]:
# Inspect the shape of the feature array.
training_data_features.shape
Out[56]:
In [70]:
# Reshape the 1-D column values into the (n_samples, 1) matrix sklearn
# expects — clearer than the np.array([series]).T contortion and produces
# the identical array.
training_data_features = train_data["sqft_living"].values.reshape(-1, 1)
In [71]:
# Confirm the (n_samples, 1) shape sklearn expects.
training_data_features.shape
Out[71]:
In [72]:
# Column-list selection gives a 2-D (n_samples, 1) target array directly,
# matching the shape the later cells construct with np.array([...]).T.
training_data_targets = train_data[["price"]].values
In [73]:
# Inspect the shape of the target array.
training_data_targets.shape
Out[73]:
In [74]:
# Reshape the 1-D price values into an (n_samples, 1) column vector —
# clearer than np.array([series]).T and produces the identical array.
training_data_targets = train_data["price"].values.reshape(-1, 1)
In [75]:
# Confirm the (n_samples, 1) column-vector shape.
training_data_targets.shape
Out[75]:
In [76]:
# Fit ordinary least squares: price ~ sqft_living.
regr_one_feature.fit(training_data_features, training_data_targets)
Out[76]:
In [78]:
# Fitted slope: estimated price change per additional square foot.
regr_one_feature.coef_
Out[78]:
In [79]:
# Build the held-out feature matrix / target vector the same way as the
# training data: column-list selection yields the 2-D (n_samples, 1)
# arrays sklearn expects, without the np.array([...]).T contortion.
test_data_features = test_data[["sqft_living"]].values
test_data_targets = test_data[["price"]].values
In [80]:
# R^2 (coefficient of determination) on the held-out data; 1.0 would be a
# perfect fit. (score() returns R^2, not variance.)
regr_one_feature.score(test_data_features, test_data_targets)
Out[80]:
In [93]:
# Mean squared error of the model on the held-out data, computed directly
# with numpy (the unused `import math` was dropped).
np.mean((regr_one_feature.predict(test_data_features) - test_data_targets) ** 2)
Out[93]:
In [90]:
# Actual prices (blue) with the model's fitted line (red) on the held-out
# data — every prediction lies on one line, so plotting them point-to-point
# traces the regression line. Label the figure so it stands alone.
plt.scatter(test_data_features, test_data_targets, color='blue')
plt.plot(test_data_features, regr_one_feature.predict(test_data_features), color='red', linewidth=3)
plt.xlabel('Living area (sqft)')
plt.ylabel('Price ($)')
plt.title('One-feature linear model on held-out data')
Out[90]:
In [91]:
# Cross-check: sklearn's mean_squared_error matches the numpy MSE above.
from sklearn.metrics import mean_squared_error
predictions = regr_one_feature.predict(test_data_features)
mean_squared_error(test_data_targets, predictions)
Out[91]:
In [94]:
# Slope of the fitted model (price per square foot).
regr_one_feature.coef_
Out[94]:
In [96]:
# Intercept of the fitted model (predicted price at zero square feet).
regr_one_feature.intercept_
Out[96]:
In [ ]: